9 Association testing with simulated phenotypes
9.1 Read in F2 recombination blocks
9.1.1 Read data
in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks/20211027"
in_files = list.files(in_dir, pattern = "F2_", full.names = T)
# Read into list
data_list = purrr::map(in_files, function(FILE){
out = readr::read_tsv(FILE,
col_types = "ciiidii")
})
# Set names as bin length
names(data_list) = basename(in_files) %>%
stringr::str_split("_", simplify = T) %>%
subset(select = 2) %>%
stringr::str_remove(".txt")
# Reorder
data_list = data_list[order(as.numeric(names(data_list)))]
counter = 0
df_list = purrr::map(data_list, function(data){
counter <<- counter + 1
# set bin length
bin_length = as.numeric(names(data_list)[counter])
# add bin start and end coordinates
df = data %>%
dplyr::mutate(SAMPLE = basename(sample) %>%
stringr::str_remove(".txt") %>%
as.numeric(.),
BIN_START = (bin - 1) * bin_length + 1,
BIN_END = bin * bin_length) %>%
# recode state to make 0 == "Cab"
dplyr::mutate(STATE = dplyr::recode(state,
`0` = 2,
`1` = 1,
`2` = 0)) %>%
dplyr::select(SAMPLE, CHROM = chr, BIN = bin, BIN_START, BIN_END, STATE)
return(df)
})9.1.2 How many possible blocks have at least one call?
9.1.2.1 Count total existing bins
bin_lengths = as.integer(names(df_list))
names(bin_lengths) = bin_lengths
n_bins = purrr::map(bin_lengths, function(BIN_LENGTH){
purrr::map_int(med_chr_lens$end, function(CHR_END){
out = seq(from = 1, to = CHR_END, by = BIN_LENGTH)
return(length(out))
})
})
n_bins_total = purrr::map_int(n_bins, sum)9.1.2.2 Proportion of total bins with calls
# Get total number of samples
n_samples = df_list$`5000`$SAMPLE %>%
unique(.) %>%
length(.)
# Build DF of bins
n_bins_df = purrr::map_dfr(1:length(df_list), function(COUNTER){
# Bin length
bin_length = as.numeric(names(df_list)[COUNTER])
# Number of total bins
n_bins = n_bins_total[COUNTER]
# Number of bins with calls
n_bins_with_calls = df_list[[COUNTER]] %>%
dplyr::distinct(CHROM, BIN_START) %>%
nrow(.)
# Number of bins with calls for each
n_bins_no_missing = df_list[[COUNTER]] %>%
dplyr::count(CHROM, BIN_START) %>%
dplyr::filter(n == n_samples) %>%
nrow(.)
# Build final data frame
out = tibble::tibble(BIN_LENGTH = bin_length,
N_BINS = n_bins,
N_BINS_WITH_CALLS = n_bins_with_calls,
N_BINS_NO_MISSING = n_bins_no_missing) %>%
dplyr::mutate(PROP_BINS_WITH_CALLS = N_BINS_WITH_CALLS / N_BINS,
PROP_BINS_NO_MISSING = N_BINS_NO_MISSING / N_BINS )
return(out)
})
DT::datatable(n_bins_df)9.1.3 Merge recombination blocks
9.1.3.1 List with final recoded genotypes (including NAs)
gt_list = purrr::map(df_list, function(BIN_LENGTH_DF){
# Widen data frame
gt_final = BIN_LENGTH_DF %>%
tidyr::pivot_wider(names_from = SAMPLE, values_from = STATE)
# Pull out matrix of genotypes
gt_mat = as.matrix(gt_final[, 5:ncol(gt_final)])
# Get indexes of loci with > 1 genotype
bins_to_keep = logical()
for (ROW in 1:nrow(gt_mat)){
# get unique values in each row
out = unique(gt_mat[ROW, ])
# remove NAs
out = out[!is.na(out)]
# if more than one value, return TRUE
if (length(out) > 1) {
bins_to_keep[ROW] = TRUE
}
# if just one value (i.e. if all samples are the same genotype at that locus), return false
else {
bins_to_keep[ROW] = FALSE
}
}
# filter gt_final
gt_filt = gt_final %>%
dplyr::filter(bins_to_keep) %>%
# recode genotypes to -1, 0, 1
dplyr::mutate(dplyr::across(-c("CHROM", "BIN", "BIN_START", "BIN_END"),
~dplyr::recode(.x,
`0` = -1,
`1` = 0,
`2` = 1))) %>%
# order
dplyr::arrange(CHROM, BIN_START)
return(gt_filt)
})
# Show first 10 columns
gt_list$`20000`[, 1:10] %>%
head(.) %>%
DT::datatable(.) 9.1.3.2 List with final recoded genotypes (complete cases only)
gt_nomiss_list = purrr::map(gt_list, function(BIN_LENGTH_DF){
BIN_LENGTH_DF %>%
dplyr::filter(complete.cases(.))
})9.2 Simulate phenotype
9.2.1 Extract samples of genotypes
# Set directory
out_dir_test = file.path(lts_dir, "association_testing/20211027_test")These must be written to a file because PhenotypeSimulator reads delimited genotypes from files.
# Get random 10 loci
set.seed(10)
n_loci = 10
# NOTE: PhenotypeSimulator::readStandardGenotypes states that the genotype file must
# delim: a [delimter]-delimited file of [(NrSNPs+1) x (NrSamples+1)] genotypes with the snpIDs in the first column and the sampleIDs in the first row and genotypes encoded as numbers between 0 and 2 representing the (posterior) mean genotype, or dosage of the minor allele.
sample_gts = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
# Pull out random SNPs
snp_sample = gt_nomiss_list[[COUNTER]] %>%
dplyr::slice_sample(n = n_loci) %>%
dplyr::arrange(CHROM, BIN_START) %>%
# create locus column
dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
# remove superfluous columns
dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>%
# recode back to 0,1,2
dplyr::mutate(dplyr::across(-LOCUS,
~dplyr::recode(.x,
`-1` = 0,
`0` = 1,
`1` = 2))) %>%
# reorder columns
dplyr::select(LOCUS, everything())
})
names(sample_gts) = names(gt_nomiss_list)
purrr::map(seq_along(sample_gts), function(COUNTER){
# save to file
bin_length = names(sample_gts)[COUNTER]
out_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
readr::write_csv(sample_gts[[COUNTER]], out_file)
})
saveRDS(sample_gts, file.path(out_dir_test, "target_loci/sample_gts.rds"))9.2.2 Simulate phenotype
sim_path = file.path(out_dir_test, "simulated_phenotypes/20211103_sim_phenos.rds")
set.seed(5671)
# N samples
N = n_samples
# N phenotypes
P = 1
# Proportion of total genetic variance
genVar = 0.5
# Proportion of genetic variance of genetic variant effects
h2s = 1
# Proportion of total noise variance
noiseVar = 0.5
# Proportion of noise variance of observational noise effects
phi = 1
sim_phenos = purrr::map(seq_along(sample_gts), function(COUNTER){
# get sample file path
bin_length = names(sample_gts)[COUNTER]
gt_sample_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
sim_pheno = PhenotypeSimulator::runSimulation(N = N, P = P,
genVar = genVar, h2s = h2s,
noiseVar = noiseVar, phi = phi,
cNrSNP = n_loci,
genotypefile = gt_sample_file,
format = "delim",
genoDelimiter = ",")
return(sim_pheno)
})
names(sim_phenos) = names(sample_gts)
saveRDS(sim_phenos, sim_path)
# Write as .xlsx to use in same Snakemake code as true GWLS
lapply(seq_along(sim_phenos), function(COUNTER){
out = tibble::tibble(fish = colnames(sample_gts[[COUNTER]])[-1],
Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y)
# set path for output
out_file = file.path(lts_dir,
"association_testing/20211027_test/simulated_phenotypes",
paste(names(sim_phenos)[COUNTER], ".xlsx", sep = ""))
# write to file
openxlsx::write.xlsx(out, out_file, overwrite = T)
})9.3 Reformat genotypes for GridLMM
9.3.1 Complete cases
final_nomiss = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
out = list()
# Genotypes
out[["genotypes"]] = gt_nomiss_list[[COUNTER]] %>%
dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>%
# convert to matrix
as.matrix(.) %>%
# transpose to put samples as rows
t(.) %>%
# convert to data frame
as.data.frame(.)
# Positions
out[["positions"]] = gt_nomiss_list[[COUNTER]] %>%
dplyr::select(CHROM, BIN_START, BIN_END)
# Phenotypes
out[["phenotypes"]] = data.frame(fish = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
as.numeric()
)
return(out)
})
names(final_nomiss) = names(gt_nomiss_list)9.3.2 With NAs
final_wimiss = purrr::map(seq_along(gt_list), function(COUNTER){
out = list()
# Genotypes
out[["genotypes"]] = gt_list[[COUNTER]] %>%
dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>%
# convert to matrix
as.matrix(.) %>%
# transpose to put samples as rows
t(.) %>%
# convert to data frame
as.data.frame(.)
# Positions
out[["positions"]] = gt_list[[COUNTER]] %>%
dplyr::select(CHROM, BIN_START, BIN_END)
# Phenotypes
out[["phenotypes"]] = data.frame(SAMPLE = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
as.numeric()
)
return(out)
})
names(final_wimiss) = names(gt_list)9.4 Test GridLMM
9.4.1 No missing
9.4.1.1 Run GWAS
test_out_nomiss = file.path(out_dir_test, "gwls_results", "gwas_results_nomiss.rds")9.4.1.2 Plot
# Create custom Manhattan plot
test_man_nomiss = lapply(seq_along(gwas_tests_nomiss), function(COUNTER){
# Get bin length
BIN_LENGTH = names(gwas_tests_nomiss)[COUNTER] %>%
as.numeric(.)
# Clean data frame
test_results = gwas_tests_nomiss[[COUNTER]]$results %>%
dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>%
# add x-coord
dplyr::mutate(X_COORD = pos + TOT) %>%
# change column names
dplyr::rename(CHROM = Chr, BIN_START = pos) %>%
# add BIN_END
dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>%
# add locus
dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
# target or not
dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
"yes",
"no"),
TARGET = factor(TARGET, levels = c("yes", "no"))) %>%
# create vector of colours
dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
gtools::even(CHROM) ~ names(gwas_pal)[2],
gtools::odd(CHROM) ~ names(gwas_pal)[3]),
# order so that `target` is plotted last, at the front
COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
SHAPE = dplyr::if_else(TARGET == "yes",
18,
20),
SIZE = dplyr::if_else(TARGET == "yes",
1,
0.5),
ALPHA = dplyr::if_else(TARGET == "yes",
1,
0.5)
)
# Plot
p1 = test_results %>%
ggplot(aes(x = X_COORD,
y = -log10(p_value_REML),
colour = COLOUR,
shape = SHAPE,
size = SIZE,
alpha = ALPHA,
label = CHROM,
label2 = BIN_START,
label3 = BIN_END)) +
geom_point() +
aes(group = rev(TARGET)) +
scale_color_manual(values = gwas_pal) +
scale_shape_identity() +
scale_size_identity() +
scale_alpha_identity() +
scale_x_continuous(breaks = med_chr_lens$MID_TOT,
labels = med_chr_lens$chr) +
theme_bw() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
) +
guides(colour = "none") +
ggtitle(paste("Bin length:", BIN_LENGTH)) +
xlab("Chromosome") +
ylab("-log10(p-value)") +
geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
return(out)
})9.4.2 Include loci with missing genotypes
9.4.2.1 Run GWAS
test_out_wimiss = file.path(out_dir_test, "gwls_results", "gwas_results_wimiss.rds")9.4.2.3 Plot
# Create custom Manhattan plot
gwas_pal = c("#2B2D42", "#F7B267", "#F25C54")
names(gwas_pal) = c("target", "even chr", "odd chr")
significance_line = 3.6
suggestive_line = 2.9
test_man_wimiss = purrr::map(seq_along(gwas_tests_wimiss), function(COUNTER){
# Get bin length
BIN_LENGTH = names(gwas_tests_wimiss)[COUNTER] %>%
as.numeric(.)
# Clean data frame
test_results = gwas_tests_wimiss[[COUNTER]]$results %>%
dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>%
# add x-coord
dplyr::mutate(X_COORD = pos + TOT) %>%
# change column names
dplyr::rename(CHROM = Chr, BIN_START = pos) %>%
# add BIN_END
dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>%
# add locus
dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
# target or not
dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
"yes",
"no"),
TARGET = factor(TARGET, levels = c("yes", "no"))) %>%
# create vector of colours
dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
gtools::even(CHROM) ~ names(gwas_pal)[2],
gtools::odd(CHROM) ~ names(gwas_pal)[3]),
# order so that `target` is plotted last, at the front
COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
SHAPE = dplyr::if_else(TARGET == "yes",
18,
20),
SIZE = dplyr::if_else(TARGET == "yes",
1,
0.5),
ALPHA = dplyr::if_else(TARGET == "yes",
1,
0.5)
)
# Plot
p1 = test_results %>%
ggplot(aes(x = X_COORD,
y = -log10(p_value_REML),
colour = COLOUR,
shape = SHAPE,
size = SIZE,
alpha = ALPHA,
label = CHROM,
label2 = BIN_START,
label3 = BIN_END)) +
geom_point() +
aes(group = rev(TARGET)) +
scale_color_manual(values = gwas_pal) +
scale_shape_identity() +
scale_size_identity() +
scale_alpha_identity() +
scale_x_continuous(breaks = med_chr_lens$MID_TOT,
labels = med_chr_lens$chr) +
theme_bw() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
) +
guides(colour = "none") +
ggtitle(paste("Bin length:", BIN_LENGTH)) +
xlab("Chromosome") +
ylab("-log10(p-value)") +
geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
return(out)
})
test_man_wimiss[[1]]9.5 Get significance levels from permutations
perm_file = here::here("data/20211109_permutation_mins.csv")
PERMUTATIONS_DIR = file.path(lts_dir, "association_testing/20211109_permutations")
SITE_FILTERS = c("all_sites", "no_repeat_sites", "no_repeat_reads"); names(SITE_FILTERS) = SITE_FILTERS
TARGET_PHENOS = c("mean", "intercept"); names(TARGET_PHENOS) = TARGET_PHENOS
BIN_LENGTHS = c(5000, 10000, 15000, 20000); names(BIN_LENGTHS) = BIN_LENGTHS
N_PERMUTATIONS = 1:10
perms_df = purrr::map_dfr(SITE_FILTERS, function(SITE_FILTER){
purrr::map_dfr(TARGET_PHENOS, function(TARGET_PHENO){
purrr::map_dfr(BIN_LENGTHS, function(BIN_LENGTH){
purrr::map_dfr(N_PERMUTATIONS, function(N_PERM){
in_list = readRDS(file.path(PERMUTATIONS_DIR, SITE_FILTER, TARGET_PHENO, BIN_LENGTH, paste(N_PERM, ".rds", sep = "")))
# Pull out minimum
out_df = tibble::tibble(MIN_P = in_list$results$p_value_REML %>%
min()
)
}, .id = "PERMUTATION")
}, .id = "BIN_LENGTH")
}, .id = "TARGET_PHENO")
}, .id = "SITE_FILTER")
# Get minimum
perms_df_mins = perms_df %>%
dplyr::group_by(SITE_FILTER, TARGET_PHENO, BIN_LENGTH) %>%
dplyr::summarise(MIN_P = min(MIN_P))
readr::write_csv(perms_df_mins,
perm_file)
perms_df_mins %>%
DT::datatable()